
Predict default of credit card clients from the customers' data of recent months
This dataset contains information on default payments, demographic factors, credit data, history of payment, and bill statements of credit card clients in Taiwan from April 2005 to September 2005. There are 25 variables:
ID: ID of each client
LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
import cufflinks as cf
cf.go_offline()
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd # package for high-performance, easy-to-use data structures and data analysis
import numpy as np # fundamental package for scientific computing with Python
import matplotlib
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn
color = sns.color_palette()
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
# from plotly import tools
# import plotly.tools as tls
# import squarify
# from mpl_toolkits.basemap import Basemap
# from numpy import array
# from matplotlib import cm
# import cufflinks and offline mode
import cufflinks as cf
cf.go_offline()
# Read the data from the xls file, sheet name: Data.
# The sheet's first row is a title row, so the real column headers end up as
# row 0 of the parsed frame; promote that row to be the header and drop it.
data=pd.read_excel('default-of-credit-card-clients.xls', 'Data', index_col=0, na_values=['NA'],)
new_header =data.iloc[0]
data = data[1:]
data.columns = new_header
data.head()
print('Number of rows '+ str(data.shape[0]))
# Fix: the original message misspelled "columns" as "cloumns".
print('Number of columns '+ str(data.shape[1]))
data.info()
# Cast the numeric columns to float.  They were loaded as object dtype because
# the header row was originally read as data; SEX/EDUCATION/MARRIAGE stay as
# object on purpose (they hold string categories).
_float_columns = [
    "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6", "AGE",
    "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
    "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6",
    "LIMIT_BAL", "default payment next month",
]
data = data.astype({column: 'float' for column in _float_columns})
data.info()
# Summary statistics as a sanity check on ranges and magnitudes.
data.describe()
# Target column and the remaining model features.
model_target = 'default payment next month'
model_features = data.columns.drop(model_target)
# Basic data-quality checks: cardinality and missing values per column.
data.nunique()
data.isna().sum()
# Unique values of the categorical (object-dtype) columns.
for categorical_column in data.select_dtypes(include=['object']).columns:
    print(data[categorical_column].unique())
# How many fully duplicated rows exist.
duplicates = data[data.duplicated()]
len(duplicates)
# Class balance of the target (fraction per class).
data[model_target].value_counts()/data.shape[0]
# Percentage distribution of each categorical column.
# NOTE(review): `display` is an IPython/notebook builtin — this only works when
# run inside a notebook environment; confirm that is the intended runtime.
for categorical_column in data.select_dtypes(include=['object']).columns:
    display(pd.crosstab(index=data[categorical_column], columns='% observations', normalize='columns')*100)
# --- Exploratory plots ---
def _pie_of_counts(counts, title, hole=None):
    # Render a plotly/cufflinks pie chart from a value_counts() series.
    frame = pd.DataFrame({'labels': counts.index, 'values': counts.values})
    if hole is None:
        frame.iplot(kind='pie', labels='labels', values='values', title=title)
    else:
        frame.iplot(kind='pie', labels='labels', values='values', title=title, hole=hole)

# Class balance of the target.
_pie_of_counts(data["default payment next month"].value_counts(), 'default payment or not')
# Age distribution.
data["AGE"].iplot(kind='histogram', title='Age Distribution')
# Education and marriage breakdowns as donut charts.
_pie_of_counts(data["EDUCATION"].value_counts(), 'EDUCATION Status ', hole=0.5)
_pie_of_counts(data["MARRIAGE"].value_counts(), 'MARRIAGE Status ', hole=0.5)
# Correlation heatmap across all numeric columns.
plt.figure(figsize=(40, 20))
sns.heatmap(data.corr(), annot=True, square=True)
# --- Cleaning ---
# Drop exact duplicate rows, then any row with a missing value.
clean_data = data.drop_duplicates().dropna()
# Remove outliers: implausible ages above 100.
clean_data = clean_data[clean_data['AGE'] <= 100]
clean_data["AGE"].iplot(kind='histogram', title='Age Distribution')
# Drop rows whose EDUCATION or MARRIAGE status carries the unassigned code 0.
clean_data = clean_data[clean_data['EDUCATION'] != 0]
clean_data = clean_data[clean_data['MARRIAGE'] != 0]
# Re-check the remaining categorical values after filtering.
for categorical_column in clean_data.select_dtypes(include=['object']).columns:
    print(clean_data[categorical_column].unique())
# One-hot encode the categorical columns, dropping the first level of each to
# avoid redundant (perfectly collinear) dummy columns.
categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE']
data_dummies = pd.get_dummies(clean_data[categorical_columns], drop_first=True)
clean_data = pd.concat([clean_data, data_dummies], axis=1)
clean_data.drop(categorical_columns, axis=1, inplace=True)
# Build a second dataset variant with min-max scaling applied to every column.
# NOTE(review): the scaler is fit on the full dataset (including the target and
# the rows that later become the test set) before any train/test split — this
# leaks test-set statistics into the scaler; confirm this is acceptable.
from sklearn import preprocessing
minmax = preprocessing.MinMaxScaler().fit(clean_data)
minmax_clean_data = pd.DataFrame(minmax.transform(clean_data), columns=list(clean_data))
# Build a third dataset variant that upsamples the minority (default) class to
# counter the class imbalance.
# NOTE(review): resampling happens before the train/test split, so duplicated
# minority rows can land in both train and test — metrics on this variant will
# be optimistic.
from sklearn.utils import resample
maj_data = minmax_clean_data[minmax_clean_data['default payment next month'] == 0]
min_data = minmax_clean_data[minmax_clean_data['default payment next month'] == 1]
min_data_oversample = resample(min_data, replace=True, n_samples=22000, random_state=587)
oversample_minmax_clean_data = pd.concat([maj_data, min_data_oversample])
oversample_minmax_clean_data['default payment next month'].value_counts()
# Visualize the class balance after upsampling.
temp = oversample_minmax_clean_data["default payment next month"].value_counts()
df = pd.DataFrame({'labels': temp.index, 'values': temp.values})
df.iplot(kind='pie', labels='labels', values='values', title='default payment or not')
# Feature list shared by all three dataset variants (dummy column names come
# from the one-hot encoding step).
features = [
    'LIMIT_BAL', 'SEX_male', 'EDUCATION_high school', 'EDUCATION_university',
    'EDUCATION_others', 'MARRIAGE_single', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]
# 80/20 split for the cleaned dataset.
clean_Y = clean_data['default payment next month'].copy()
clean_X = clean_data[features].copy()
clean_X_train, clean_X_test, clean_y_train, clean_y_test = train_test_split(
    clean_X, clean_Y, test_size=0.20, shuffle=True, random_state=42)
# 80/20 split for the cleaned + min-max scaled dataset.
minmax_clean_Y = minmax_clean_data['default payment next month'].copy()
minmax_clean_X = minmax_clean_data[features].copy()
minmax_clean_X_train, minmax_clean_X_test, minmax_clean_y_train, minmax_clean_y_test = train_test_split(
    minmax_clean_X, minmax_clean_Y, test_size=0.20, shuffle=True, random_state=42)
# 80/20 split for the cleaned + scaled + upsampled dataset.
oversample_minmax_clean_Y = oversample_minmax_clean_data['default payment next month'].copy()
oversample_minmax_clean_X = oversample_minmax_clean_data[features].copy()
oversample_minmax_clean_X_train, oversample_minmax_clean_X_test, oversample_minmax_clean_y_train, oversample_minmax_clean_y_test = train_test_split(
    oversample_minmax_clean_X, oversample_minmax_clean_Y, test_size=0.20, shuffle=True, random_state=42)
# Decision tree (max_depth=2) trained and evaluated on the cleaned dataset.
# Local name `conf_mat` used instead of `cf`, which shadows the cufflinks alias.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=14)
tree_clf.fit(clean_X_train, clean_y_train)
tree_pred = tree_clf.predict(clean_X_test)
conf_mat = confusion_matrix(clean_y_test, tree_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=clean_y_test, y_pred=tree_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(clean_y_test, tree_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(clean_y_test, tree_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
# Decision tree (max_depth=2) trained and evaluated on the min-max scaled
# dataset (trees are scale-invariant, so results should match the unscaled run).
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=14)
tree_clf.fit(minmax_clean_X_train, minmax_clean_y_train)
tree_pred = tree_clf.predict(minmax_clean_X_test)
conf_mat = confusion_matrix(minmax_clean_y_test, tree_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=minmax_clean_y_test, y_pred=tree_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(minmax_clean_y_test, tree_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(minmax_clean_y_test, tree_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
# Decision tree (max_depth=2) on the cleaned + scaled + upsampled dataset.
classifier = DecisionTreeClassifier(max_depth=2, random_state=14)
# fit the classifier
classifier.fit(oversample_minmax_clean_X_train, oversample_minmax_clean_y_train)
# test the predictions
predictions = classifier.predict(oversample_minmax_clean_X_test)
# Fix: removed a redundant standalone accuracy_score(...) call whose result was
# discarded; accuracy is computed (and printed) once below.
cf=confusion_matrix(oversample_minmax_clean_y_test, predictions)
sns.heatmap(cf, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true = oversample_minmax_clean_y_test, y_pred = predictions)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(oversample_minmax_clean_y_test, predictions)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(oversample_minmax_clean_y_test, predictions, average = 'binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
# Decision tree (max_depth=32) on the cleaned + scaled + upsampled dataset.
# NOTE(review): depth 32 will fit the training data almost perfectly; combined
# with pre-split oversampling, the test metrics here are likely inflated.
tree_clf = DecisionTreeClassifier(max_depth=32, random_state=14)
tree_clf.fit(oversample_minmax_clean_X_train, oversample_minmax_clean_y_train)
tree_pred = tree_clf.predict(oversample_minmax_clean_X_test)
conf_mat = confusion_matrix(oversample_minmax_clean_y_test, tree_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=oversample_minmax_clean_y_test, y_pred=tree_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(oversample_minmax_clean_y_test, tree_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(oversample_minmax_clean_y_test, tree_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
from sklearn import linear_model
# Logistic regression on the cleaned (unscaled) dataset.
logreg =linear_model.LogisticRegression()
logreg.fit(clean_X_train, clean_y_train)
# test the predictions
# Fix: the model was trained on clean_X_train but originally predicted on
# minmax_clean_X_test (differently-scaled features), while being scored
# against clean_y_test. Predict on the matching clean_X_test instead.
predictions = logreg.predict(clean_X_test)
# Fix: also removed a redundant accuracy_score(...) call whose result was
# discarded; accuracy is computed once below.
cf=confusion_matrix(clean_y_test, predictions)
sns.heatmap(cf, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true = clean_y_test, y_pred = predictions)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(clean_y_test, predictions)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(clean_y_test, predictions, average = 'binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
# Logistic regression on the cleaned + min-max scaled dataset.
logreg =linear_model.LogisticRegression()
# fit the classifier
logreg.fit(minmax_clean_X_train, minmax_clean_y_train)
# test the predictions
predictions = logreg.predict(minmax_clean_X_test)
cf=confusion_matrix(minmax_clean_y_test, predictions)
sns.heatmap(cf, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true = minmax_clean_y_test, y_pred = predictions)
print(f'Accuracy: {accuracy}')
# Fix: AUC was computed against clean_y_test; score this dataset's predictions
# against its own labels, minmax_clean_y_test, consistently with the other
# metrics in this section.
auc = roc_auc_score(minmax_clean_y_test, predictions)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(minmax_clean_y_test, predictions, average = 'binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
# Logistic regression on the cleaned + scaled + upsampled dataset
# (the original comment wrongly said "cleaned data").
logreg = linear_model.LogisticRegression()
logreg.fit(oversample_minmax_clean_X_train, oversample_minmax_clean_y_train)
lr_pred = logreg.predict(oversample_minmax_clean_X_test)
conf_mat = confusion_matrix(oversample_minmax_clean_y_test, lr_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=oversample_minmax_clean_y_test, y_pred=lr_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(oversample_minmax_clean_y_test, lr_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(oversample_minmax_clean_y_test, lr_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours (k=10) on the cleaned (unscaled) dataset
# (the original comment wrongly said "LogisticRegression").
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(clean_X_train, clean_y_train)
knn_pred = knn.predict(clean_X_test)
conf_mat = confusion_matrix(clean_y_test, knn_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=clean_y_test, y_pred=knn_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(clean_y_test, knn_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(clean_y_test, knn_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
# k-nearest neighbours (k=10) on the cleaned + min-max scaled dataset.
# Scaling matters for KNN since it is distance-based.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(minmax_clean_X_train, minmax_clean_y_train)
knn_pred = knn.predict(minmax_clean_X_test)
conf_mat = confusion_matrix(minmax_clean_y_test, knn_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=minmax_clean_y_test, y_pred=knn_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(minmax_clean_y_test, knn_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(minmax_clean_y_test, knn_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
# k-nearest neighbours (k=1) on the cleaned + scaled + upsampled dataset.
# NOTE(review): with n_neighbors=1 and oversampling done before the split,
# duplicated minority rows in the test set often have an identical twin in the
# training set — the resulting scores are likely inflated; confirm intent.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(oversample_minmax_clean_X_train, oversample_minmax_clean_y_train)
knn_pred = knn.predict(oversample_minmax_clean_X_test)
conf_mat = confusion_matrix(oversample_minmax_clean_y_test, knn_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=oversample_minmax_clean_y_test, y_pred=knn_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(oversample_minmax_clean_y_test, knn_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(oversample_minmax_clean_y_test, knn_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
from sklearn.ensemble import RandomForestClassifier
# Random forest (20 trees) on the cleaned (unscaled) dataset.
rf = RandomForestClassifier(n_jobs=10, random_state=10, n_estimators=20, verbose=False)
rf.fit(clean_X_train, clean_y_train)
rf_pred = rf.predict(clean_X_test)
conf_mat = confusion_matrix(clean_y_test, rf_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=clean_y_test, y_pred=rf_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(clean_y_test, rf_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(clean_y_test, rf_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
# Random forest (20 trees) on the cleaned + min-max scaled dataset.
rf = RandomForestClassifier(n_jobs=10, random_state=10, n_estimators=20, verbose=False)
rf.fit(minmax_clean_X_train, minmax_clean_y_train)
rf_pred = rf.predict(minmax_clean_X_test)
conf_mat = confusion_matrix(minmax_clean_y_test, rf_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=minmax_clean_y_test, y_pred=rf_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(minmax_clean_y_test, rf_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(minmax_clean_y_test, rf_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
# Random forest (20 trees) on the cleaned + scaled + upsampled dataset.
# NOTE(review): random_state=20 here differs from the 10 used by the other
# forests — confirm that is intentional.
rf = RandomForestClassifier(n_jobs=10, random_state=20, n_estimators=20, verbose=False)
rf.fit(oversample_minmax_clean_X_train, oversample_minmax_clean_y_train)
rf_pred = rf.predict(oversample_minmax_clean_X_test)
conf_mat = confusion_matrix(oversample_minmax_clean_y_test, rf_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=oversample_minmax_clean_y_test, y_pred=rf_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(oversample_minmax_clean_y_test, rf_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(oversample_minmax_clean_y_test, rf_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
from xgboost import XGBClassifier
# Gradient-boosted trees (XGBoost) on the cleaned (unscaled) dataset.
# Hyperparameters kept exactly as originally chosen.
xgb = XGBClassifier(use_label_encoder=False,
                    learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=100,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    nthread=4,
                    scale_pos_weight=1,
                    seed=27)
xgb.fit(clean_X_train, clean_y_train)
xgb_pred = xgb.predict(clean_X_test)
conf_mat = confusion_matrix(clean_y_test, xgb_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=clean_y_test, y_pred=xgb_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(clean_y_test, xgb_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(clean_y_test, xgb_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
# Gradient-boosted trees (XGBoost) on the cleaned + min-max scaled dataset.
xgb = XGBClassifier(use_label_encoder=False,
                    learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=100,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    nthread=4,
                    scale_pos_weight=1,
                    seed=27)
xgb.fit(minmax_clean_X_train, minmax_clean_y_train)
xgb_pred = xgb.predict(minmax_clean_X_test)
conf_mat = confusion_matrix(minmax_clean_y_test, xgb_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=minmax_clean_y_test, y_pred=xgb_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(minmax_clean_y_test, xgb_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(minmax_clean_y_test, xgb_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')
# Gradient-boosted trees (XGBoost) on the cleaned + scaled + upsampled dataset.
# NOTE(review): as with the other models on this variant, pre-split
# oversampling means these test metrics are likely optimistic.
xgb = XGBClassifier(use_label_encoder=False,
                    learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=100,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    nthread=4,
                    scale_pos_weight=1,
                    seed=27)
xgb.fit(oversample_minmax_clean_X_train, oversample_minmax_clean_y_train)
xgb_pred = xgb.predict(oversample_minmax_clean_X_test)
conf_mat = confusion_matrix(oversample_minmax_clean_y_test, xgb_pred)
sns.heatmap(conf_mat, annot=True, fmt='.0f', cmap="YlGnBu").set_title('Confusion Matrix')
accuracy = accuracy_score(y_true=oversample_minmax_clean_y_test, y_pred=xgb_pred)
print(f'Accuracy: {accuracy}')
auc = roc_auc_score(oversample_minmax_clean_y_test, xgb_pred)
print(f'AUC : {auc}')
precision, recall, f1_score, _ = precision_recall_fscore_support(oversample_minmax_clean_y_test, xgb_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1_score: {f1_score}')